data(Golub_Merge)
df <- data.frame(Golub_Merge)[1:7129] %>%
decostand(method="log", MARGIN = 2)
df_rows <- rownames(df) %>% as.numeric()
#group vector to check
env <- Golub_Merge$ALL.AML
env1 <- Golub_Merge$T.B.cell
col1 <- c("red", "blue")[env %>% match(unique(env))]
col2 <- c("green", "violet")[env1 %>% match(unique(env1))]
env
## [1] ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL
## [20] ALL AML AML AML AML AML AML AML AML AML AML AML AML AML AML ALL ALL ALL ALL
## [39] ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL
## [58] ALL ALL ALL ALL AML AML AML AML AML AML AML AML AML AML AML
## Levels: ALL AML
A little bit normalization would be better, but data looks wierd to be honest, and pre-normalized
df_var <- (df %>% apply(2, sd)) / abs(df %>% apply(2, mean))
df_var <- log10(df_var + 1)
ggplot() +
geom_density(aes(df_var)) +
geom_boxplot(aes(df_var, 0), outliers = F) +
geom_vline(aes(xintercept = .15), color = "red") +
geom_vline(aes(xintercept = .15), color = "red") +
xlab("log 10 of sd / mean of the data") +
theme_minimal()
This step does not make any difference, except faster working script
#df <- df[, df_var > .4]
dist_mtd <- c("euclidean", "maximum", "manhattan",
"canberra", "binary", "minkowski")
dist_list <- list()
heatmap_custom <- function(dist_mat, mtd) {
dist_mat %>%
as.matrix %>%
heatmap(ColSideColors = col1,
RowSideColors = col2,
main = mtd,
scale = "none")
}
for (i in 1:length(dist_mtd)){
dist_list[[i]] <- df %>% dist(method = dist_mtd[i])
heatmap_custom(dist_list[[i]], dist_mtd[i])
}
clust <- function (DM, mtd) {
DM %>% hclust(method = mtd)
}
wrap_clust <- function (DM, mtd) {
cop_clust <- DM %>% clust(mtd) %>% cophenetic()
c(mtd, cor(DM, cop_clust))
}
clust_mtd <- c(
"ward.D", "ward.D2", "single", "complete",
"average", "mcquitty", "median", "centroid"
)
clust_list <- data.frame("dist"=NA, "clust"=NA, "cof"=NA)
for (i in clust_mtd) {
for (j in 1:length(dist_mtd)) {
clust_list <- rbind (clust_list,
c(dist_mtd[j], wrap_clust(dist_list[[j]], i)))
}
}
clust_list <- clust_list[-1,] %>%
mutate(cof = as.numeric(cof))
gg <- ggplot() +
geom_point(data = clust_list,
aes(dist, clust,
size = cof, color = cof), show.legend = F) +
geom_text(data = clust_list %>% subset(cof > 0.8),
aes(dist, clust,
label = round(cof, 2)),
size = 2, color = "white") +
scale_color_viridis_c("Cophenetic correlation", direction = -1) +
scale_size_continuous("Cophenetic correlation") +
theme_minimal()
ggplotly(gg)%>%
style(showlegend = FALSE)
We have a lot of good cluster variants. Let’s check several
We can see a lot of clusters with not very big differences between groups, maybe there is another clustering feature.
gg <- df %>%
dist(method = "maximum") %>%
hclust(method = "centroid")
xpos <- gg$labels %>% as.numeric
cols2 <- env[xpos%>% match(1:72)]
shps2 <- env1[xpos%>% match(1:72)] %>% as.character
shps2[is.na(shps2)] <- ""
col_tmp <- data.frame(x = xpos, y = -.00003, col = cols2, shape = shps2)
gg %>%
as.dendrogram() %>%
dendro_data %>%
segment %>%
ggplot() +
geom_segment(aes(x=x, y=y, xend=xend, yend=yend)) +
geom_point(data = col_tmp, aes(x, y, color = col, shape = shape), size = .9) +
theme_dendro() +
scale_color_viridis_d(name = "Type") +
scale_shape_discrete(name = "Cell")
gg <- df %>%
dist(method = "binary") %>%
hclust(method = "average")
xpos <- gg$labels %>% as.numeric
cols2 <- env[xpos%>% match(1:72)]
shps2 <- env1[xpos%>% match(1:72)] %>% as.character
shps2[is.na(shps2)] <- ""
col_tmp <- data.frame(x = xpos, y = -.00003, col = cols2, shape = shps2)
gg %>%
as.dendrogram() %>%
dendro_data %>%
segment %>%
ggplot() +
geom_segment(aes(x=x, y=y, xend=xend, yend=yend)) +
geom_point(data = col_tmp, aes(x, y, color = col, shape = shape), size = .9) +
theme_dendro() +
scale_color_viridis_d(name = "Type") +
scale_shape_discrete(name = "Cell")
gg <- df %>%
dist(method = "maximum") %>%
hclust(method = "average")
xpos <- gg$labels %>% as.numeric
cols2 <- env[xpos%>% match(1:72)]
shps2 <- env1[xpos%>% match(1:72)] %>% as.character
shps2[is.na(shps2)] <- ""
col_tmp <- data.frame(x = xpos, y = -.00003, col = cols2, shape = shps2)
gg %>%
as.dendrogram() %>%
dendro_data %>%
segment %>%
ggplot() +
geom_segment(aes(x=x, y=y, xend=xend, yend=yend)) +
geom_point(data = col_tmp, aes(x, y, color = col, shape = shape), size = .9) +
theme_dendro() +
scale_color_viridis_d(name = "Type") +
scale_shape_discrete(name = "Cell")
gg2 <- df %>%
t %>%
pvclust(
nboot = 50,
method.dist = "maximum",
method.hclust = "average",
parallel = T)
## Creating a temporary cluster...done:
## кластер-приемник с 7 узлами на хосте 'localhost'
## Multiscale bootstrap... Done.
gg2$hclust$labels <- env
gg2 %>%
plot(cex = .5)
gg3 <- df %>%
t %>%
pvclust(
nboot = 50,
method.dist = "binary",
method.hclust = "average",
parallel = T)
## Creating a temporary cluster...done:
## кластер-приемник с 7 узлами на хосте 'localhost'
## Multiscale bootstrap... Done.
gg3$hclust$labels <- env
plot(gg3)
In general, hierarchical methods can not suit this data well. It could be improved using different de-noizing techniques, or methods for dimension reduction.
Let’s make some meaningful plot with cluster viz:
gg4 <- df %>%
apply(2, scale) %>%
prcomp
gr <- gg %>% cutree(5) %>% as.character
env1 <- as.character(env1)
env1[is.na(env1)] <- ""
gg5 <- gg4[["x"]] %>%
ggplot(aes(PC1, PC2)) +
geom_point(aes(color = gr, shape = env)) +
theme_void()
ggplotly(gg5)
OK, at least we can see there probably that segregation happens above - on 4 not really much different clusters with a gradient.